In [1]:
# Core data handling and numerics
import pandas as pd
import numpy as np
import os
import datetime

# Plotting stack (static)
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
import seaborn as sns

# Machine-learning models
from sklearn import tree
from sklearn import ensemble

import itertools

# Scoring and model-selection helpers
from sklearn import metrics
from sklearn import model_selection

# Irradiance modeling and project-local helpers
import pvlib
import cs_detection
import utils
import visualize_plotly as visualize
sns.set_style("white")

matplotlib.rcParams['figure.figsize'] = (20., 8.)

from IPython.display import Image

# Interactive plotting: plotly + cufflinks in offline mode
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objs as go
import cufflinks as cf
cf.go_offline()
init_notebook_mode(connected=True)

# Auto-reload edited local modules (cs_detection, utils, ...) without restarting
%load_ext autoreload
%autoreload 2

np.set_printoptions(precision=4)
%matplotlib inline

%config InlineBackend.figure_format = 'retina'

matplotlib.rcParams.update({'font.size': 16})

# NOTE(review): blanket warning suppression can hide real problems
import warnings
warnings.filterwarnings(action='ignore')

import xgboost as xgb


plt.close('all')

import pygal

Train on default data

In [2]:
# Load the Albuquerque NSRDB record and localize timestamps to Mountain Standard Time.
detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status')
mst_index = detect_obj.df.index.tz_convert('MST')
detect_obj.df.index = mst_index
In [3]:
# Split the record at a fixed date: data before 2015 trains the model,
# data from 2015 onward is held out for testing.
split_date = '01-01-2015'

train_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
train_obj.trim_dates(None, split_date)

test_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
test_obj.trim_dates(split_date, None)
In [4]:
# Random forest: 32 trees, all CPU cores, fixed seed for reproducibility.
clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, random_state=42)
In [5]:
# Train on the pre-2015 period; fit_model returns the fitted classifier.
clf = train_obj.fit_model(clf)
In [6]:
# Boolean clear-sky predictions for the test period.
pred = test_obj.predict(clf)
In [7]:
# Score predictions against the NSRDB 'sky_status' labels on the raw test set.
print(metrics.accuracy_score(test_obj.df['sky_status'], pred))
0.943607305936
In [8]:
print(metrics.recall_score(test_obj.df['sky_status'], pred))
0.891294741494
In [9]:
print(metrics.precision_score(test_obj.df['sky_status'], pred))
0.890507726269
In [10]:
print(metrics.f1_score(test_obj.df['sky_status'], pred))
0.890901060071
In [11]:
# Confusion matrix: rows = true class, columns = predicted class.
cm = metrics.confusion_matrix(test_obj.df['sky_status'], pred)
In [12]:
visualize.plot_confusion_matrix2(cm, ('cloudy', 'clear'))
Out[12]:
<matplotlib.axes._subplots.AxesSubplot at 0x1129c25c0>
In [13]:
# Bar chart of random-forest feature importances, labeled with feature names.
fig, ax = plt.subplots(figsize=(12, 8))

importances = clf.feature_importances_
positions = range(len(importances))

_ = ax.bar(positions, importances)
_ = ax.set_xticks(positions)
_ = ax.set_xticklabels(test_obj.features_, rotation=45)

_ = ax.set_xlabel('Feature')
_ = ax.set_ylabel('Importance')

_ = fig.tight_layout()
In [14]:
# Overlay measured GHI and modeled clear-sky GHI, highlighting where the
# random forest and NSRDB labels agree and disagree.
fig, ax = plt.subplots(figsize=(12, 8))

nsrdb_mask = test_obj.df['sky_status'].values
rf_only = pred & ~nsrdb_mask
nsrdb_only = nsrdb_mask & ~pred
agree = nsrdb_mask & pred

ax.plot(test_obj.df.index, test_obj.df['GHI'], label='GHI', alpha=.5)
ax.plot(test_obj.df.index, test_obj.df['Clearsky GHI pvlib'], label='GHIcs', alpha=.5)
ax.scatter(test_obj.df[rf_only].index, test_obj.df[rf_only]['GHI'], label='RF only', zorder=10, s=10)
ax.scatter(test_obj.df[nsrdb_only].index, test_obj.df[nsrdb_only]['GHI'], label='NSRDB only', zorder=10, s=10)
ax.scatter(test_obj.df[agree].index, test_obj.df[agree]['GHI'], label='Both', zorder=10, s=10)
_ = ax.legend(bbox_to_anchor=(1.25, 1))
In [15]:
# Feature correlation matrix plus interactive traces of the Gaussian-smoothed
# GHI columns used as features.
vis = visualize.Visualizer()
vis.plot_corr_matrix(test_obj.df[test_obj.features_].corr().values, test_obj.features_)
test_obj.df[['GHI', 'GHI mean', 'GHI gauss10 mean', 'GHI gauss1 mean', 'GHI gauss.1 mean', 'GHI gauss.5 mean', 'GHI gauss.01 mean']].iplot()
In [ ]:
 

Train on 'cleaned' data

Clean data by setting some cutoffs (by hand). For this study, the GHI/GHIcs mean must be >= 0.95 and the coefficient of variation must be less than 0.1. We also need to limit the mean GHI-GHIcs difference (to guard against spurious ratios during low-irradiance periods) to a fixed cutoff, 50 W/m^2 in the cells below.

In [16]:
# Reload the raw data so the 'cleaned' experiment starts from a fresh object.
detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status')
detect_obj.df.index = detect_obj.df.index.tz_convert('MST')
In [17]:
# Same 2015-01-01 train/test split as the default-data experiment.
split_date = '01-01-2015'

train_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
train_obj.trim_dates(None, split_date)

test_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
test_obj.trim_dates(split_date, None)
In [18]:
# Same forest configuration as before, now trained on quality-filtered labels.
clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, random_state=42)
In [19]:
# Only samples passing the hand-set cutoffs (ratio mean >= 0.95,
# mean difference <= 50) are trusted as training labels.
clf = train_obj.fit_model(clf, ratio_mean_val=0.95, diff_mean_val=50)
In [20]:
pred = test_obj.predict(clf)
In [21]:
# Apply the same cutoffs to the test labels; the scoring cells below read
# the resulting df['quality_mask'] column.
test_obj.filter_labels(ratio_mean_val=0.95, diff_mean_val=50)
In [22]:
# Score only the rows that pass the quality mask.
print(metrics.accuracy_score(test_obj.df[test_obj.df['quality_mask']]['sky_status'], pred[test_obj.df['quality_mask']]))
0.996222342148
In [23]:
print(metrics.recall_score(test_obj.df[test_obj.df['quality_mask']]['sky_status'], pred[test_obj.df['quality_mask']]))
0.994697304463
In [24]:
print(metrics.precision_score(test_obj.df[test_obj.df['quality_mask']]['sky_status'], pred[test_obj.df['quality_mask']]))
0.999112294718
In [25]:
print(metrics.f1_score(test_obj.df[test_obj.df['quality_mask']]['sky_status'], pred[test_obj.df['quality_mask']]))
0.996899911426
In [26]:
# Confusion matrix restricted to quality-masked rows.
cm = metrics.confusion_matrix(test_obj.df[test_obj.df['quality_mask']]['sky_status'], pred[test_obj.df['quality_mask']])
In [27]:
visualize.plot_confusion_matrix2(cm, ('cloudy', 'clear'))
Out[27]:
<matplotlib.axes._subplots.AxesSubplot at 0x1139515f8>
In [28]:
# Feature importances of the forest trained on quality-filtered labels.
fig, ax = plt.subplots(figsize=(12, 8))

importances = clf.feature_importances_
positions = range(len(importances))

_ = ax.bar(positions, importances)
_ = ax.set_xticks(positions)
_ = ax.set_xticklabels(test_obj.features_, rotation=45)

_ = ax.set_xlabel('Feature')
_ = ax.set_ylabel('Importance')

_ = fig.tight_layout()
# (Reformatted from a single collapsed line.) Static overview of the test
# period with NSRDB/RF clear-sky agreement highlighted.
fig, ax = plt.subplots(figsize=(24, 8))

nsrdb_mask = test_obj.df['sky_status'].values

ax.plot(test_obj.df.index, test_obj.df['GHI'], label='GHI', alpha=.5)
ax.plot(test_obj.df.index, test_obj.df['Clearsky GHI pvlib'], label='GHIcs', alpha=.5)
ax.scatter(test_obj.df[nsrdb_mask & ~pred].index, test_obj.df[nsrdb_mask & ~pred]['GHI'], label='NSRDB only', zorder=10, s=50)
ax.scatter(test_obj.df[pred & ~nsrdb_mask].index, test_obj.df[pred & ~nsrdb_mask]['GHI'], label='RF only', zorder=10, s=50)
ax.scatter(test_obj.df[nsrdb_mask & pred].index, test_obj.df[nsrdb_mask & pred]['GHI'], label='Both', zorder=10, s=50)
_ = ax.legend(bbox_to_anchor=(1.25, 1))
_ = ax.set_xlabel('Date')
_ = ax.set_ylabel('GHI / Wm$^{-2}$')
In [29]:
# Interactive (plotly) version of the clear-sky agreement plot.
nsrdb_mask = test_obj.df['sky_status'].values
nsrdb_only = nsrdb_mask & ~pred
rf_only = pred & ~nsrdb_mask
agree = nsrdb_mask & pred

trace1 = go.Scatter(x=test_obj.df.index, y=test_obj.df['GHI'], name='GHI')
trace2 = go.Scatter(x=test_obj.df.index, y=test_obj.df['Clearsky GHI pvlib'], name='GHIcs')
trace3 = go.Scatter(x=test_obj.df[nsrdb_only].index, y=test_obj.df[nsrdb_only]['GHI'], name='NSRDB only', mode='markers', marker={'size': 10})
trace4 = go.Scatter(x=test_obj.df[rf_only].index, y=test_obj.df[rf_only]['GHI'], name='RF only', mode='markers', marker={'size': 10})
trace5 = go.Scatter(x=test_obj.df[agree].index, y=test_obj.df[agree]['GHI'], name='Both', mode='markers', marker={'size': 10})

iplot([trace1, trace2, trace3, trace4, trace5])
In [30]:
# Correlation matrix of the features; motivates the feature-removal study below.
vis = visualize.Visualizer()
vis.plot_corr_matrix(test_obj.df[test_obj.features_].corr().values, test_obj.features_)

Feature importance and correlation investigation

The previous two sections show promising results going from default to cleaned data. We see marked improvements in scoring and the confusion matrix. The features used in both cases are highly correlated, though. Here we will look at feature importances in a different way - by removing 1 at a time and recalculating the F1 score.

In [31]:
# Reduced candidate feature set; commented entries were dropped for this run.
features = [
    # 'GHILL-GHIcsLL',
    'GHI-GHIcs mean',
    'GHI-GHIcs std',
    'dGHI-dGHIcs mean',
    # 'dGHI-dGHIcs std',
    'abs(t-tnoon)']
In [32]:
# Rebuild train/test objects restricted to the reduced feature set, train,
# predict, and report F1 on the quality-masked rows.
detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status')
detect_obj.df.index = detect_obj.df.index.tz_convert('MST')
train_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
train_obj.trim_dates(None, '01-01-2015')
train_obj.features_ = features
test_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
test_obj.trim_dates('01-01-2015', None)
test_obj.features_ = features
clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, random_state=42)
clf = train_obj.fit_model(clf, ratio_mean_val=0.95, diff_mean_val=50)
pred = test_obj.predict(clf)
test_obj.filter_labels(ratio_mean_val=0.95, diff_mean_val=50)
f1_score = metrics.f1_score(test_obj.get_masked_df()['sky_status'], pred[test_obj.df['quality_mask']])
# BUG FIX: the original called 'Full features'.format(...) -- the string has
# no '{}' placeholder, so .format() returned it unchanged and the feature
# list was silently dropped from the printed output.
print('Full features: {}'.format(features))
print('    F1: {}'.format(f1_score))
vis = visualize.Visualizer()
vis.plot_corr_matrix(test_obj.df[test_obj.features_].corr().values, test_obj.features_)


# for i in range(3, len(features) - 1):
#     for feat_set in itertools.combinations(features, i):
#         detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status')
#         detect_obj.df.index = detect_obj.df.index.tz_convert('MST')
#         train_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
#         train_obj.trim_dates(None, '01-01-2015')
#         train_obj.features_ = list(feat_set)
#         test_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status')
#         test_obj.trim_dates('01-01-2015', None)
#         test_obj.features_ = list(feat_set)
#         clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, random_state=42)
#         clf = train_obj.fit_model(clf, ratio_mean_val=0.95, diff_mean_val=50)
#         pred = test_obj.predict(clf)
#         test_obj.filter_labels(ratio_mean_val=0.95, diff_mean_val=50)
#         f1_score = metrics.f1_score(test_obj.df[test_obj.df['mask']]['sky_status'], pred[test_obj.df['mask']])
#         print('features: {}'.format(feat_set))
#         print('    F1: {}'.format(f1_score))
Process ForkPoolWorker-10:
Process ForkPoolWorker-9:
Process ForkPoolWorker-12:
Process ForkPoolWorker-11:
Traceback (most recent call last):
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Traceback (most recent call last):
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/benellis/duramat/clearsky_detection/utils.py", line 344, in day_prediction
    calc_all_window_metrics(day, window, meas_col=meas_col, model_col=model_col, overwrite=overwrite)
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/Users/benellis/duramat/clearsky_detection/utils.py", line 62, in calc_all_window_metrics
    calc_line_length(df, meas_col, window, dx, label=label_ll_1, overwrite=overwrite)
  File "/Users/benellis/duramat/clearsky_detection/utils.py", line 286, in calc_line_length
    df[label] = df[col].rolling(window, center=True).apply(lambda x: line_length(x, dx)).fillna(0)
  File "/Users/benellis/duramat/clearsky_detection/utils.py", line 352, in day_prediction
    clear_model = day[y_pred][model_col]
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/window.py", line 1207, in apply
    return super(Rolling, self).apply(func, args=args, kwargs=kwargs)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/frame.py", line 1958, in __getitem__
    return self._getitem_array(key)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/window.py", line 856, in apply
    center=False)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/window.py", line 801, in _apply
    result = calc(values)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/frame.py", line 2000, in _getitem_array
    return self.take(indexer, axis=0, convert=False)
Traceback (most recent call last):
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/generic.py", line 1928, in take
    convert=True, verify=True)
Traceback (most recent call last):
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/window.py", line 795, in calc
    closed=self.closed)
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/internals.py", line 4011, in take
    axis=axis, allow_dups=True)
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/internals.py", line 3901, in reindex_indexer
    return self.__class__(new_blocks, new_axes)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/window.py", line 853, in f
    offset, func, args, kwargs)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/internals.py", line 2797, in __init__
    self._consolidate_check()
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/Users/benellis/duramat/clearsky_detection/utils.py", line 347, in day_prediction
    y_pred = clf.predict(X)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/internals.py", line 3305, in _consolidate_check
    ftypes = [blk.ftype for blk in self.blocks]
  File "pandas/_libs/window.pyx", line 1462, in pandas._libs.window.roll_generic (pandas/_libs/window.c:36229)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/internals.py", line 3305, in <listcomp>
    ftypes = [blk.ftype for blk in self.blocks]
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 534, in predict
    proba = self.predict_proba(X)
  File "/Users/benellis/duramat/clearsky_detection/utils.py", line 347, in day_prediction
    y_pred = clf.predict(X)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/pandas/core/internals.py", line 309, in ftype
    return "%s:%s" % (self.dtype, self._ftype)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 583, in predict_proba
    for e in self.estimators_)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 776, in __call__
    self._terminate_backend()
  File "/Users/benellis/duramat/clearsky_detection/utils.py", line 286, in <lambda>
    df[label] = df[col].rolling(window, center=True).apply(lambda x: line_length(x, dx)).fillna(0)
KeyboardInterrupt
  File "/Users/benellis/duramat/clearsky_detection/utils.py", line 304, in line_length
    ydiffs = np.diff(x)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 534, in predict
    proba = self.predict_proba(X)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 553, in _terminate_backend
    self._backend.terminate()
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 134, in terminate
    self._pool.terminate()  # terminate does a join()
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 583, in predict_proba
    for e in self.estimators_)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/numpy/lib/function_base.py", line 1926, in diff
    return a[slice1]-a[slice2]
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 768, in __call__
    self.retrieve()
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 505, in terminate
    self._terminate()
KeyboardInterrupt
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/util.py", line 186, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 719, in retrieve
    raise exception
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 546, in _terminate_pool
    worker_handler.join()
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 682, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 602, in get
    self.wait(timeout)
  File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 1054, in join
    self._wait_for_tstate_lock()
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 599, in wait
    self._event.wait(timeout)
  File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 1070, in _wait_for_tstate_lock
    elif lock.acquire(block, timeout):
  File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 549, in wait
    signaled = self._cond.wait(timeout)
KeyboardInterrupt
  File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 293, in wait
    waiter.acquire()
KeyboardInterrupt
---------------------------------------------------------------------------
KeyboardInterrupt                         Traceback (most recent call last)
<ipython-input-32-627439ef4ce4> in <module>()
      9 clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, random_state=42)
     10 clf = train_obj.fit_model(clf, ratio_mean_val=0.95, diff_mean_val=50)
---> 11 pred = test_obj.predict(clf)
     12 test_obj.filter_labels(ratio_mean_val=0.95, diff_mean_val=50)
     13 f1_score = metrics.f1_score(test_obj.get_masked_df()['sky_status'], pred[test_obj.df['quality_mask']])

~/duramat/clearsky_detection/cs_detection.py in predict(self, *args, **kwargs)
    691 
    692     def predict(self, *args, **kwargs):
--> 693         return self.iter_predict_daily(*args, **kwargs)
    694 
    695     def downsample(self, min_freq):

~/duramat/clearsky_detection/cs_detection.py in iter_predict_daily(self, clf, n_iter, tol, ml_label, overwrite, multiproc, by_day)
    678                 )
    679             pool.close()
--> 680             pool.join()
    681         else:
    682             final_dfs = [utils.day_prediction(day, clf, self.features_, self.window, self.meas_col, self.model_col,

~/miniconda3/lib/python3.5/multiprocessing/pool.py in join(self)
    508         util.debug('joining pool')
    509         assert self._state in (CLOSE, TERMINATE)
--> 510         self._worker_handler.join()
    511         self._task_handler.join()
    512         self._result_handler.join()

~/miniconda3/lib/python3.5/threading.py in join(self, timeout)
   1052 
   1053         if timeout is None:
-> 1054             self._wait_for_tstate_lock()
   1055         else:
   1056             # the behavior of a negative timeout isn't documented, but

~/miniconda3/lib/python3.5/threading.py in _wait_for_tstate_lock(self, block, timeout)
   1068         if lock is None:  # already determined that the C code is done
   1069             assert self._is_stopped
-> 1070         elif lock.acquire(block, timeout):
   1071             lock.release()
   1072             self._stop()

KeyboardInterrupt: 
Process ForkPoolWorker-14:
Process ForkPoolWorker-13:
Process ForkPoolWorker-15:
Process ForkPoolWorker-16:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
Traceback (most recent call last):
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
Traceback (most recent call last):
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/Users/benellis/duramat/clearsky_detection/utils.py", line 347, in day_prediction
    y_pred = clf.predict(X)
  File "/Users/benellis/duramat/clearsky_detection/utils.py", line 347, in day_prediction
    y_pred = clf.predict(X)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 534, in predict
    proba = self.predict_proba(X)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 534, in predict
    proba = self.predict_proba(X)
  File "/Users/benellis/duramat/clearsky_detection/utils.py", line 347, in day_prediction
    y_pred = clf.predict(X)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 583, in predict_proba
    for e in self.estimators_)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 534, in predict
    proba = self.predict_proba(X)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 583, in predict_proba
    for e in self.estimators_)
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 249, in _bootstrap
    self.run()
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 583, in predict_proba
    for e in self.estimators_)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 776, in __call__
    self._terminate_backend()
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 776, in __call__
    self._terminate_backend()
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 553, in _terminate_backend
    self._backend.terminate()
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 776, in __call__
    self._terminate_backend()
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 553, in _terminate_backend
    self._backend.terminate()
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 134, in terminate
    self._pool.terminate()  # terminate does a join()
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 553, in _terminate_backend
    self._backend.terminate()
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 134, in terminate
    self._pool.terminate()  # terminate does a join()
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 505, in terminate
    self._terminate()
  File "/Users/benellis/duramat/clearsky_detection/utils.py", line 347, in day_prediction
    y_pred = clf.predict(X)
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 505, in terminate
    self._terminate()
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 134, in terminate
    self._pool.terminate()  # terminate does a join()
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/util.py", line 186, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/util.py", line 186, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 505, in terminate
    self._terminate()
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 534, in predict
    proba = self.predict_proba(X)
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 546, in _terminate_pool
    worker_handler.join()
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/util.py", line 186, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/ensemble/forest.py", line 583, in predict_proba
    for e in self.estimators_)
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 546, in _terminate_pool
    worker_handler.join()
  File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 1054, in join
    self._wait_for_tstate_lock()
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 546, in _terminate_pool
    worker_handler.join()
  File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 1070, in _wait_for_tstate_lock
    elif lock.acquire(block, timeout):
  File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 1054, in join
    self._wait_for_tstate_lock()
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 776, in __call__
    self._terminate_backend()
  File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 1054, in join
    self._wait_for_tstate_lock()
KeyboardInterrupt
  File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 1070, in _wait_for_tstate_lock
    elif lock.acquire(block, timeout):
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 553, in _terminate_backend
    self._backend.terminate()
KeyboardInterrupt
  File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 1070, in _wait_for_tstate_lock
    elif lock.acquire(block, timeout):
KeyboardInterrupt
  File "/Users/benellis/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 134, in terminate
    self._pool.terminate()  # terminate does a join()
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 505, in terminate
    self._terminate()
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/util.py", line 186, in __call__
    res = self._callback(*self._args, **self._kwargs)
  File "/Users/benellis/miniconda3/lib/python3.5/multiprocessing/pool.py", line 546, in _terminate_pool
    worker_handler.join()
  File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 1054, in join
    self._wait_for_tstate_lock()
  File "/Users/benellis/miniconda3/lib/python3.5/threading.py", line 1070, in _wait_for_tstate_lock
    elif lock.acquire(block, timeout):
KeyboardInterrupt
In [ ]:
# BUG FIX: the active cells above create the boolean column via
# filter_labels() as 'quality_mask' (see the scoring cells in this notebook);
# 'mask' appears only in dead commented-out code, so df['mask'] would raise
# a KeyError here. Use 'quality_mask' for consistency.
cm = metrics.confusion_matrix(test_obj.df[test_obj.df['quality_mask']]['sky_status'], pred[test_obj.df['quality_mask']])
visualize.plot_confusion_matrix2(cm, ('cloudy', 'clear'))
In [ ]:
# Importances of the reduced feature set.
fig, ax = plt.subplots(figsize=(12, 8))

importances = clf.feature_importances_
positions = range(len(importances))

_ = ax.bar(positions, importances)
_ = ax.set_xticks(positions)
_ = ax.set_xticklabels(test_obj.features_, rotation=45)

_ = ax.set_xlabel('Feature')
_ = ax.set_ylabel('Importance')

_ = fig.tight_layout()
In [ ]:
# Interactive comparison of GHI, clear-sky GHI, and label agreement for the
# reduced-feature model.
nsrdb_mask = test_obj.df['sky_status'].values
nsrdb_only = nsrdb_mask & ~pred
rf_only = pred & ~nsrdb_mask
agree = nsrdb_mask & pred

trace1 = go.Scatter(x=test_obj.df.index, y=test_obj.df['GHI'], name='GHI')
trace2 = go.Scatter(x=test_obj.df.index, y=test_obj.df['Clearsky GHI pvlib'], name='GHIcs')
trace3 = go.Scatter(x=test_obj.df[nsrdb_only].index, y=test_obj.df[nsrdb_only]['GHI'], name='NSRDB only', mode='markers', marker={'size': 10})
trace4 = go.Scatter(x=test_obj.df[rf_only].index, y=test_obj.df[rf_only]['GHI'], name='RF only', mode='markers', marker={'size': 10})
trace5 = go.Scatter(x=test_obj.df[agree].index, y=test_obj.df[agree]['GHI'], name='Both', mode='markers', marker={'size': 10})

iplot([trace1, trace2, trace3, trace4, trace5])
Full features F1: 0.9866607381058248 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean') F1: 0.9856331440026729 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std') F1: 0.9577911055788164 features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs mean') F1: 0.9385054192812322 features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs std') F1: 0.9371298405466971 features: ('GHILL-GHIcsLL', 'abs(t-tnoon)') F1: 0.7715979316433346 features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std') F1: 0.985423389340158 features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean') F1: 0.9853745673774701 features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs std') F1: 0.9849649181423321 features: ('abs(GHI-GHIcs) mean', 'abs(t-tnoon)') F1: 0.9841906034290804 features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean') F1: 0.9587466185752931 features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs std') F1: 0.9558723693143246 features: ('abs(GHI-GHIcs) std', 'abs(t-tnoon)') F1: 0.8895328619450562 features: ('dGHI-dGHIcs mean', 'dGHI-dGHIcs std') F1: 0.9494128274616079 features: ('dGHI-dGHIcs mean', 'abs(t-tnoon)') F1: 0.7775988105563127 features: ('dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.9019607843137255 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std') F1: 0.9864263462394304 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean') F1: 0.9869608826479438 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'dGHI-dGHIcs std') F1: 0.985510477039679 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'abs(t-tnoon)') F1: 0.9859563085153812 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean') F1: 0.9683237515499944 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs std') F1: 0.9661573288058857 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'abs(t-tnoon)') F1: 0.9654549559720027 features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std') F1: 0.9620510503727129 features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs mean', 'abs(t-tnoon)') F1: 0.9518619436875567 features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs 
std', 'abs(t-tnoon)') F1: 0.9541180469015521 features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean') F1: 0.9849749582637729 features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs std') F1: 0.985552344965548 features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'abs(t-tnoon)') F1: 0.986 features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std') F1: 0.985087914533719 features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean', 'abs(t-tnoon)') F1: 0.9870881567230633 features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.9863044204431579 features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std') F1: 0.9650979509119567 features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'abs(t-tnoon)') F1: 0.9652144545761566 features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.9621938833088816 features: ('dGHI-dGHIcs mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.9511643680759666 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean') F1: 0.9868801423171003 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs std') F1: 0.9853300733496334 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'abs(t-tnoon)') F1: 0.9868830591373944 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std') F1: 0.9862191598132919 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean', 'abs(t-tnoon)') F1: 0.9873164218958611 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.986289153940475 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std') F1: 0.9702547985183522 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'abs(t-tnoon)') F1: 0.9708563069652302 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.9679241021007453 features: 
('GHILL-GHIcsLL', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.966884433430953 features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std') F1: 0.986 features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'abs(t-tnoon)') F1: 0.9862160960426856 features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.9852074296518742 features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.9860879243183084 features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.9699628754640567features = ['GHI-GHIcs mean', 'dGHI-dGHIcs mean', 'abs(t-tnoon)'] detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status') detect_obj.df.index = detect_obj.df.index.tz_convert('MST') train_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status') train_obj.trim_dates(None, '01-01-2015') train_obj.features_ = features test_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status') test_obj.trim_dates('01-01-2015', None) test_obj.features_ = features clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, random_state=42) clf = train_obj.fit_model(clf, ratio_mean_val=0.95, diff_mean_val=50) pred = test_obj.predict(clf) test_obj.filter_labels(ratio_mean_val=0.95, diff_mean_val=50) f1_score = metrics.f1_score(test_obj.df[test_obj.df['mask']]['sky_status'], pred[test_obj.df['mask']]) print(f1_score) nsrdb_mask = test_obj.df['sky_status'].values trace1 = go.Scatter(x=test_obj.df.index, y=test_obj.df['GHI'], name='GHI') trace2 = go.Scatter(x=test_obj.df.index, y=test_obj.df['Clearsky GHI pvlib'], name='GHIcs') trace3 = go.Scatter(x=test_obj.df[nsrdb_mask & ~pred].index, y=test_obj.df[nsrdb_mask & ~pred]['GHI'], name='NSRDB only', mode='markers', marker={'size': 10}) trace4 = 
go.Scatter(x=test_obj.df[pred & ~nsrdb_mask].index, y=test_obj.df[pred & ~nsrdb_mask]['GHI'], name='RF only', mode='markers', marker={'size': 10}) trace5 = go.Scatter(x=test_obj.df[nsrdb_mask & pred].index, y=test_obj.df[nsrdb_mask & pred]['GHI'], name='Both', mode='markers', marker={'size': 10}) # _ = ax.legend(bbox_to_anchor=(1.25, 1)) # _ = ax.set_xlabel('Date') # _ = ax.set_ylabel('GHI / Wm$^{-2}$') iplot([trace1, trace2, trace3, trace4, trace5])

Predict on ground data at multiple sampling rates (Oct 2015)

In [ ]:
# Load one month of ABQ ground-measured irradiance (no sky_status labels here,
# unlike the NSRDB pickles used for training) and align the index to local time.
detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib')
detect_obj.df.index = detect_obj.df.index.tz_convert('MST')
detect_obj.trim_dates('10-01-2015', '11-01-2015')
# Reuse the reduced feature set selected earlier so predictions match the trained model.
detect_obj.features_ = features
In [ ]:
# Predict clear-sky periods with the RF trained on NSRDB data.
# NOTE(review): model was trained on 30-min NSRDB data but this is native-resolution
# ground data — the downsampling cells below probe that mismatch.
pred = detect_obj.predict(clf)
In [ ]:
# Overlay measured GHI, modeled clear-sky GHI, and the predicted-clear points.
trace1 = go.Scatter(x=detect_obj.df.index, y=detect_obj.df['GHI'], name='GHI')
trace2 = go.Scatter(x=detect_obj.df.index, y=detect_obj.df['Clearsky GHI pvlib'], name='GHIcs')
trace3 = go.Scatter(x=detect_obj.df[pred].index, y=detect_obj.df[pred]['GHI'], name='Clear', mode='markers', marker={'size': 10})

iplot([trace1, trace2, trace3])
# Compare raw GHI against its rolling/Gaussian-smoothed variants
# (columns presumably added by the feature-engineering step — TODO confirm in cs_detection).
detect_obj.df[['GHI', 'GHI mean', 'GHI gauss10 mean', 'GHI gauss1 mean', 'GHI gauss.1 mean']].iplot()
In [ ]:
detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib')
detect_obj.df.index = detect_obj.df.index.tz_convert('MST')
detect_obj.trim_dates('10-01-2015', '11-01-2015')
detect_obj.features_ = features
In [ ]:
detect_obj.downsample(5)
In [ ]:
pred = detect_obj.predict(clf)
In [ ]:
trace1 = go.Scatter(x=detect_obj.df.index, y=detect_obj.df['GHI'], name='GHI')
trace2 = go.Scatter(x=detect_obj.df.index, y=detect_obj.df['Clearsky GHI pvlib'], name='GHIcs')
trace3 = go.Scatter(x=detect_obj.df[pred].index, y=detect_obj.df[pred]['GHI'], name='Clear', mode='markers', marker={'size': 10})

iplot([trace1, trace2, trace3])
In [ ]:
detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib')
detect_obj.df.index = detect_obj.df.index.tz_convert('MST')
detect_obj.trim_dates('10-01-2015', '11-01-2015')
detect_obj.features_ = features
In [ ]:
detect_obj.downsample(10)
In [ ]:
pred = detect_obj.predict(clf)
In [ ]:
trace1 = go.Scatter(x=detect_obj.df.index, y=detect_obj.df['GHI'], name='GHI')
trace2 = go.Scatter(x=detect_obj.df.index, y=detect_obj.df['Clearsky GHI pvlib'], name='GHIcs')
trace3 = go.Scatter(x=detect_obj.df[pred].index, y=detect_obj.df[pred]['GHI'], name='Clear', mode='markers', marker={'size': 10})

iplot([trace1, trace2, trace3])
In [ ]:
detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib')
detect_obj.df.index = detect_obj.df.index.tz_convert('MST')
detect_obj.trim_dates('10-01-2015', '11-01-2015')
detect_obj.features_ = features
In [ ]:
detect_obj.downsample(15)
In [ ]:
pred = detect_obj.predict(clf)
In [ ]:
trace1 = go.Scatter(x=detect_obj.df.index, y=detect_obj.df['GHI'], name='GHI')
trace2 = go.Scatter(x=detect_obj.df.index, y=detect_obj.df['Clearsky GHI pvlib'], name='GHIcs')
trace3 = go.Scatter(x=detect_obj.df[pred].index, y=detect_obj.df[pred]['GHI'], name='Clear', mode='markers', marker={'size': 10})

iplot([trace1, trace2, trace3])
In [ ]:
detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib')
detect_obj.df.index = detect_obj.df.index.tz_convert('MST')
detect_obj.trim_dates('10-01-2015', '11-01-2015')
detect_obj.features_ = features
In [ ]:
detect_obj.downsample(30)
In [ ]:
pred = detect_obj.predict(clf)
In [ ]:
trace1 = go.Scatter(x=detect_obj.df.index, y=detect_obj.df['GHI'], name='GHI')
trace2 = go.Scatter(x=detect_obj.df.index, y=detect_obj.df['Clearsky GHI pvlib'], name='GHIcs')
trace3 = go.Scatter(x=detect_obj.df[pred].index, y=detect_obj.df[pred]['GHI'], name='Clear', mode='markers', marker={'size': 10})

iplot([trace1, trace2, trace3])
In [ ]:
# Sanity check: display the final timestamp of the NSRDB test set
# to confirm the trim_dates window covered the expected range.
test_obj.df.index[-1]
In [ ]:
 
In [ ]:
 

SCRATCH

The previous two sections show promising results going from default to cleaned data. We see marked improvements in scoring and the confusion matrix. The features used in both cases are highly correlated, though. Here we will look at feature importances in a different way - by training on each feature subset (via itertools.combinations) and recalculating the F1 score for each.

features = [ # 'GHILL-GHIcsLL', 'GHI-GHIcs mean', 'GHI-GHIcs std', 'dGHI-dGHIcs mean', # 'dGHI-dGHIcs std', 'abs(t-tnoon)']detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status') detect_obj.df.index = detect_obj.df.index.tz_convert('MST') train_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status') train_obj.trim_dates(None, '01-01-2015') train_obj.features_ = features test_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status') test_obj.trim_dates('01-01-2015', None) test_obj.features_ = features clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, random_state=42) clf = train_obj.fit_model(clf, ratio_mean_val=0.95, diff_mean_val=50) pred = test_obj.predict(clf) test_obj.filter_labels(ratio_mean_val=0.95, diff_mean_val=50) f1_score = metrics.f1_score(test_obj.df[test_obj.df['mask']]['sky_status'], pred[test_obj.df['mask']]) print('Full features'.format([i for i in features])) print(' F1: {}'.format(f1_score)) vis = visualize.Visualizer() vis.plot_corr_matrix(test_obj.df[test_obj.features_].corr().values, test_obj.features_) # for i in range(3, len(features) - 1): # for feat_set in itertools.combinations(features, i): # detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status') # detect_obj.df.index = detect_obj.df.index.tz_convert('MST') # train_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status') # train_obj.trim_dates(None, '01-01-2015') # train_obj.features_ = list(feat_set) # test_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status') # test_obj.trim_dates('01-01-2015', None) # test_obj.features_ = list(feat_set) # clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, random_state=42) # clf = train_obj.fit_model(clf, ratio_mean_val=0.95, 
diff_mean_val=50) # pred = test_obj.predict(clf) # test_obj.filter_labels(ratio_mean_val=0.95, diff_mean_val=50) # f1_score = metrics.f1_score(test_obj.df[test_obj.df['mask']]['sky_status'], pred[test_obj.df['mask']]) # print('features: {}'.format(feat_set)) # print(' F1: {}'.format(f1_score))cm = metrics.confusion_matrix(test_obj.df[test_obj.df['mask']]['sky_status'], pred[test_obj.df['mask']]) visualize.plot_confusion_matrix2(cm, ('cloudy', 'clear'))fig, ax = plt.subplots(figsize=(12, 8)) _ = ax.bar(range(len(clf.feature_importances_)), clf.feature_importances_) _ = ax.set_xticks(range(len(clf.feature_importances_))) _ = ax.set_xticklabels(test_obj.features_, rotation=45) _ = ax.set_ylabel('Importance') _ = ax.set_xlabel('Feature') _ = fig.tight_layout()# fig, ax = plt.subplots(figsize=(12, 8)) nsrdb_mask = test_obj.df['sky_status'].values trace1 = go.Scatter(x=test_obj.df.index, y=test_obj.df['GHI'], name='GHI') trace2 = go.Scatter(x=test_obj.df.index, y=test_obj.df['Clearsky GHI pvlib'], name='GHIcs') trace3 = go.Scatter(x=test_obj.df[nsrdb_mask & ~pred].index, y=test_obj.df[nsrdb_mask & ~pred]['GHI'], name='NSRDB only', mode='markers', marker={'size': 10}) trace4 = go.Scatter(x=test_obj.df[pred & ~nsrdb_mask].index, y=test_obj.df[pred & ~nsrdb_mask]['GHI'], name='RF only', mode='markers', marker={'size': 10}) trace5 = go.Scatter(x=test_obj.df[nsrdb_mask & pred].index, y=test_obj.df[nsrdb_mask & pred]['GHI'], name='Both', mode='markers', marker={'size': 10}) # _ = ax.legend(bbox_to_anchor=(1.25, 1)) # _ = ax.set_xlabel('Date') # _ = ax.set_ylabel('GHI / Wm$^{-2}$') iplot([trace1, trace2, trace3, trace4, trace5])Full features F1: 0.9866607381058248 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean') F1: 0.9856331440026729 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std') F1: 0.9577911055788164 features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs mean') F1: 0.9385054192812322 features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs std') F1: 0.9371298405466971 features: ('GHILL-GHIcsLL', 
'abs(t-tnoon)') F1: 0.7715979316433346 features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std') F1: 0.985423389340158 features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean') F1: 0.9853745673774701 features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs std') F1: 0.9849649181423321 features: ('abs(GHI-GHIcs) mean', 'abs(t-tnoon)') F1: 0.9841906034290804 features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean') F1: 0.9587466185752931 features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs std') F1: 0.9558723693143246 features: ('abs(GHI-GHIcs) std', 'abs(t-tnoon)') F1: 0.8895328619450562 features: ('dGHI-dGHIcs mean', 'dGHI-dGHIcs std') F1: 0.9494128274616079 features: ('dGHI-dGHIcs mean', 'abs(t-tnoon)') F1: 0.7775988105563127 features: ('dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.9019607843137255 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std') F1: 0.9864263462394304 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean') F1: 0.9869608826479438 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'dGHI-dGHIcs std') F1: 0.985510477039679 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'abs(t-tnoon)') F1: 0.9859563085153812 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean') F1: 0.9683237515499944 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs std') F1: 0.9661573288058857 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'abs(t-tnoon)') F1: 0.9654549559720027 features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std') F1: 0.9620510503727129 features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs mean', 'abs(t-tnoon)') F1: 0.9518619436875567 features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.9541180469015521 features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean') F1: 0.9849749582637729 features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs std') F1: 0.985552344965548 features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'abs(t-tnoon)') F1: 0.986 features: ('abs(GHI-GHIcs) mean', 
'dGHI-dGHIcs mean', 'dGHI-dGHIcs std') F1: 0.985087914533719 features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean', 'abs(t-tnoon)') F1: 0.9870881567230633 features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.9863044204431579 features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std') F1: 0.9650979509119567 features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'abs(t-tnoon)') F1: 0.9652144545761566 features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.9621938833088816 features: ('dGHI-dGHIcs mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.9511643680759666 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean') F1: 0.9868801423171003 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs std') F1: 0.9853300733496334 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'abs(t-tnoon)') F1: 0.9868830591373944 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std') F1: 0.9862191598132919 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean', 'abs(t-tnoon)') F1: 0.9873164218958611 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.986289153940475 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std') F1: 0.9702547985183522 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'abs(t-tnoon)') F1: 0.9708563069652302 features: ('GHILL-GHIcsLL', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.9679241021007453 features: ('GHILL-GHIcsLL', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.966884433430953 features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std') F1: 0.986 features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'abs(t-tnoon)') F1: 0.9862160960426856 features: ('abs(GHI-GHIcs) mean', 'abs(GHI-GHIcs) std', 
'dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.9852074296518742 features: ('abs(GHI-GHIcs) mean', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.9860879243183084 features: ('abs(GHI-GHIcs) std', 'dGHI-dGHIcs mean', 'dGHI-dGHIcs std', 'abs(t-tnoon)') F1: 0.9699628754640567features = ['GHI-GHIcs mean', 'dGHI-dGHIcs mean', 'abs(t-tnoon)'] detect_obj = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz', 'GHI', 'Clearsky GHI pvlib', 'sky_status') detect_obj.df.index = detect_obj.df.index.tz_convert('MST') train_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status') train_obj.trim_dates(None, '01-01-2015') train_obj.features_ = features test_obj = cs_detection.ClearskyDetection(detect_obj.df, 'GHI', 'Clearsky GHI pvlib', 'sky_status') test_obj.trim_dates('01-01-2015', None) test_obj.features_ = features clf = ensemble.RandomForestClassifier(n_jobs=-1, n_estimators=32, random_state=42) clf = train_obj.fit_model(clf, ratio_mean_val=0.95, diff_mean_val=50) pred = test_obj.predict(clf) test_obj.filter_labels(ratio_mean_val=0.95, diff_mean_val=50) f1_score = metrics.f1_score(test_obj.df[test_obj.df['mask']]['sky_status'], pred[test_obj.df['mask']]) print(f1_score) nsrdb_mask = test_obj.df['sky_status'].values trace1 = go.Scatter(x=test_obj.df.index, y=test_obj.df['GHI'], name='GHI') trace2 = go.Scatter(x=test_obj.df.index, y=test_obj.df['Clearsky GHI pvlib'], name='GHIcs') trace3 = go.Scatter(x=test_obj.df[nsrdb_mask & ~pred].index, y=test_obj.df[nsrdb_mask & ~pred]['GHI'], name='NSRDB only', mode='markers', marker={'size': 10}) trace4 = go.Scatter(x=test_obj.df[pred & ~nsrdb_mask].index, y=test_obj.df[pred & ~nsrdb_mask]['GHI'], name='RF only', mode='markers', marker={'size': 10}) trace5 = go.Scatter(x=test_obj.df[nsrdb_mask & pred].index, y=test_obj.df[nsrdb_mask & pred]['GHI'], name='Both', mode='markers', marker={'size': 10}) # _ = ax.legend(bbox_to_anchor=(1.25, 1)) # _ = ax.set_xlabel('Date') # _ = 
ax.set_ylabel('GHI / Wm$^{-2}$') iplot([trace1, trace2, trace3, trace4, trace5])